import os
import numpy as np
import pandas as pd
# TensorFlow
import tensorflow as tf
from tensorflow.keras import layers, models
# cv
import cv2
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib import cm
# Global matplotlib styling for every figure in this notebook.
# NOTE(review): the 'seaborn-whitegrid' style name was removed in
# matplotlib >= 3.6 (renamed 'seaborn-v0_8-whitegrid') — confirm pinned version.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# Jupyter magic: render figures inline in the notebook output.
%matplotlib inline
import warnings
# Suppress library warnings to keep notebook output clean.
warnings.filterwarnings("ignore")
Training neural networks for automated diagnosis of pigmented skin lesions is hampered by the small size and lack of diversity of available datasets of dermatoscopic images. We tackle this problem by releasing the HAM10000 ("Human Against Machine with 10000 training images") dataset. We collected dermatoscopic images from different populations, acquired and stored by different modalities. The final dataset consists of 10015 dermatoscopic images which can serve as a training set for academic machine learning purposes. Cases include a representative collection of all important diagnostic categories in the realm of pigmented lesions: Actinic keratoses and intraepithelial carcinoma / Bowen's disease (akiec), basal cell carcinoma (bcc), benign keratosis-like lesions (solar lentigines / seborrheic keratoses and lichen-planus like keratoses, bkl), dermatofibroma (df), melanoma (mel), melanocytic nevi (nv) and vascular lesions (angiomas, angiokeratomas, pyogenic granulomas and hemorrhage, vasc).
More than 50% of lesions are confirmed through histopathology (histo), the ground truth for the rest of the cases is either follow-up examination (follow_up), expert consensus (consensus), or confirmation by in-vivo confocal microscopy (confocal). The dataset includes lesions with multiple images, which can be tracked by the lesion_id-column within the HAM10000_metadata file.
Due to upload size limitations, images are stored in two files:
* HAM10000_images_part1.zip (5000 JPEG files)
* HAM10000_images_part2.zip (5015 JPEG files)
The HAM10000 dataset served as the training set for the ISIC 2018 challenge (Task 3). The test-set images are available herein as ISIC2018_Task3_Test_Images.zip (1511 images), the official validation-set is available through the challenge website https://challenge2018.isic-archive.com/. The ISIC-Archive also provides a "Live challenge" submission site for continuous evaluation of automated classifiers on the official validation- and test-set.
Test-set evaluations of the ISIC 2018 challenge were compared to physicians on an international scale, where the majority of challenge participants outperformed expert readers: Tschandl P. et al., Lancet Oncol 2019
The test-set images were also used in a study comparing different methods and scenarios of human-computer collaboration: Tschandl P. et al., Nature Medicine 2020. The following corresponding metadata is available herein:
def Path_Tree(PATH, Extension):
    """Print a colourised tree of PATH's sub-directories and collect matching files.

    Parameters
    ----------
    PATH : str
        Root directory whose immediate sub-directories are scanned.
    Extension : str
        File suffix to keep (e.g. '.jpg').

    Returns
    -------
    dict
        Maps each sub-directory name to the list of its file names ending
        with `Extension`.
    """
    Out = {}
    sep = ' ' * 3
    BACK = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN, 'Yellow': Back.YELLOW,
            'Blue': Back.BLUE, 'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    # Header: directory name framed by '=' rules.
    title = PATH.split('\\')[-1]
    print(Style.RESET_ALL + Fore.BLUE + Style.NORMAL + '=' * (len(title) + 1) + Style.RESET_ALL)
    print(Back.BLACK + Fore.CYAN + Style.NORMAL + title + ':' + Style.RESET_ALL)
    print(Style.RESET_ALL + Fore.BLUE + Style.NORMAL + '=' * (len(title) + 1) + Style.RESET_ALL)
    # Background colours cycled per sub-directory (modulo avoids index overflow,
    # so the original pre-multiplied colour list is unnecessary).
    C = ['Red', 'Green', 'Yellow', 'Blue', 'Magenta', 'Cyan']
    i = 0
    for entry in os.listdir(PATH):
        Sub = os.path.join(PATH, entry)
        # BUG FIX: the original tested os.path.isdir(PATH) — always True —
        # so plain files inside PATH crashed the os.listdir(Sub) call below.
        # Test the entry itself instead.
        if os.path.isdir(Sub):
            print('└──', BACK[C[i % len(C)]] + Fore.BLACK + Style.NORMAL + entry + ':' + Style.RESET_ALL)
            List = [x for x in os.listdir(Sub) if x.endswith(Extension)]
            Out[entry] = List
            if List:
                # Report the count/extension and a preview of the first 5 names.
                print(2 * sep, Fore.BLUE + Style.NORMAL +
                      '%i %s files:' % (len(List), List[0].split('.')[-1].upper()) + Style.RESET_ALL)
                print(2 * sep, ', '.join(List[:5]) + ', ...')
            else:
                # BUG FIX: the original raised IndexError (List[0]) on an
                # empty sub-directory.
                print(2 * sep, Fore.BLUE + Style.NORMAL + '0 files' + Style.RESET_ALL)
            i += 1
    return Out
# Root directory holding one sub-directory per diagnostic class.
Path = 'slatmd_mod'
# {class_name: [jpg file names]} — also prints a coloured directory tree.
Files_dict = Path_Tree(Path, '.jpg')
# Short HAM10000 class codes mapped to their full diagnostic names.
Labels_dict = {'akiec': 'Actinic Keratoses and Intraepithelial Carcinoma',
'bcc': 'Basal Cell Carcinoma',
'bkl': 'Benign Keratosis-like Lesions',
'df': 'Dermatofibroma',
'mel': 'Melanoma',
'nv': 'Melanocytic Nevi',
'vasc': 'Vascular Lesions'}
=========== slatmd_mod: =========== └── akiec: 327 JPG files: ISIC_0024329.jpg, ISIC_0024372.jpg, ISIC_0024418.jpg, ISIC_0024450.jpg, ISIC_0024463.jpg, ... └── bcc: 514 JPG files: ISIC_0024331.jpg, ISIC_0024332.jpg, ISIC_0024345.jpg, ISIC_0024360.jpg, ISIC_0024403.jpg, ... └── bkl: 1099 JPG files: ISIC_0024312.jpg, ISIC_0024324.jpg, ISIC_0024336.jpg, ISIC_0024337.jpg, ISIC_0024338.jpg, ... └── df: 115 JPG files: ISIC_0024318.jpg, ISIC_0024330.jpg, ISIC_0024386.jpg, ISIC_0024396.jpg, ISIC_0024553.jpg, ... └── mel: 1113 JPG files: ISIC_0024310.jpg, ISIC_0024313.jpg, ISIC_0024315.jpg, ISIC_0024323.jpg, ISIC_0024333.jpg, ... └── nv: 6705 JPG files: ISIC_0024306.jpg, ISIC_0024307.jpg, ISIC_0024308.jpg, ISIC_0024309.jpg, ISIC_0024311.jpg, ... └── vasc: 142 JPG files: ISIC_0024370.jpg, ISIC_0024375.jpg, ISIC_0024402.jpg, ISIC_0024475.jpg, ISIC_0024662.jpg, ...
# Input-pipeline hyper-parameters; images are resized to 180x180.
batch_size = 128
Img_Height = 180
Img_Width = 180
# 80/20 train/validation split from the class sub-directories; the shared
# seed keeps the two subsets disjoint and reproducible.
train_ds = tf.keras.preprocessing.image_dataset_from_directory(directory= Path, validation_split=0.2, subset="training",
seed=123, image_size=(Img_Height, Img_Width),
batch_size=batch_size)
val_ds = tf.keras.preprocessing.image_dataset_from_directory(directory= Path, validation_split=0.2, subset="validation",
seed=123, image_size=(Img_Height, Img_Width),
batch_size=batch_size)
Found 10015 files belonging to 7 classes. Using 8012 files for training. Found 10015 files belonging to 7 classes. Using 2003 files for validation.
# Preview a 5x5 grid of training images, each titled with its full class
# name and coloured by class index.
fig, ax = plt.subplots(5, 5 , figsize = (16, 16))
ax = ax.ravel()
class_names = train_ds.class_names
# One Tableau colour per class index, used to colour the title text.
Colors = dict(zip(np.arange(len(Labels_dict.keys())), mcolors.TABLEAU_COLORS.values()))
for images, labels in train_ds.take(1):  # a single batch (batch_size=128 >= 25)
    for i in range(len(ax)):
        _ = ax[i].imshow(images[i].numpy().astype("uint8"))
        _ = ax[i].set_title(Labels_dict[class_names[labels[i]]],
        fontweight='bold', fontsize = 10.5, color = Colors[labels[i].numpy()])
        _ = ax[i].axis("off")
        _ = ax[i].set_aspect(1)
A multi-layer perceptron (MLP) is a class of feedforward artificial neural network (ANN). At each iteration, the algorithm uses the Cross-Entropy Loss to measure the error, then computes the gradient and updates the model. At the end of this iterative process, we reach a better level of agreement between the true and predicted labels, since the error is lower than that of the first step.
# Simple CNN: rescale pixels to [0, 1], three Conv/MaxPool stages, then a
# dense head.  The final Dense layer emits raw logits (no softmax), one per
# class — matched by from_logits=True at compile time.
model = models.Sequential([layers.experimental.preprocessing.Rescaling(1./255, input_shape=(Img_Height, Img_Width, 3)),
layers.Conv2D(16, 3, padding='same', activation='relu'),
layers.MaxPooling2D(),
layers.Conv2D(32, 3, padding='same', activation='relu'),
layers.MaxPooling2D(),
layers.Conv2D(64, 3, padding='same', activation='relu'),
layers.MaxPooling2D(),
layers.Flatten(),
layers.Dense(128, activation='relu'),
layers.Dense(len(Labels_dict))])
model.summary()
# Graphviz diagram of the architecture (requires pydot + graphviz installed).
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True, expand_nested = True)
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= rescaling (Rescaling) (None, 180, 180, 3) 0 _________________________________________________________________ conv2d (Conv2D) (None, 180, 180, 16) 448 _________________________________________________________________ max_pooling2d (MaxPooling2D) (None, 90, 90, 16) 0 _________________________________________________________________ conv2d_1 (Conv2D) (None, 90, 90, 32) 4640 _________________________________________________________________ max_pooling2d_1 (MaxPooling2 (None, 45, 45, 32) 0 _________________________________________________________________ conv2d_2 (Conv2D) (None, 45, 45, 64) 18496 _________________________________________________________________ max_pooling2d_2 (MaxPooling2 (None, 22, 22, 64) 0 _________________________________________________________________ flatten (Flatten) (None, 30976) 0 _________________________________________________________________ dense (Dense) (None, 128) 3965056 _________________________________________________________________ dense_1 (Dense) (None, 7) 903 ================================================================= Total params: 3,989,543 Trainable params: 3,989,543 Non-trainable params: 0 _________________________________________________________________
Compiling and fitting the model
# Number of iterations (training epochs)
IT = 11
# SparseCategoricalCrossentropy: labels are integer class indices;
# from_logits=True matches the final Dense layer, which applies no softmax.
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
# Training the model
history = model.fit(train_ds, validation_data=val_ds, epochs=IT, verbose = 0)
def Search_List(Key, List): return [s for s in List if Key in s]
# Mapping from Keras history keys to display-friendly column titles.
Metrics_Names = {'loss':'Loss', 'accuracy':'Accuracy', 'mae':'MAE', 'mse':'MSE', 'recall': 'Recall'}
def Table_modify(df, Metrics_Names = Metrics_Names):
    """Rename metric columns via Metrics_Names, sort columns alphabetically,
    and prepend an 'Iteration' counter column (0..n-1)."""
    renamed = df.rename(columns=Metrics_Names)
    renamed = renamed[sorted(renamed.columns)]
    renamed.insert(0, 'Iteration', np.arange(renamed.shape[0]))
    return renamed
# ---- Build per-epoch metric tables from the Keras History object ----
# Keys prefixed with 'val_' belong to the validation set; the rest are train.
Validation_Table = Search_List('val_', history.history.keys())
Train_Table = list(set(history.history.keys()) - set(Validation_Table))
Validation_Table = pd.DataFrame(np.array([history.history[x] for x in Validation_Table]).T,
                                columns=Validation_Table)
Train_Table = pd.DataFrame(np.array([history.history[x] for x in Train_Table]).T,
                           columns=Train_Table)
# Strip the 'val_' prefix so both tables share the same column names.
Validation_Table.columns = [x.replace('val_', '') for x in Validation_Table.columns]
Train_Table = Table_modify(Train_Table)
Validation_Table = Table_modify(Validation_Table)
# ---- Final evaluation on both splits ----
# Train Set Score
score = model.evaluate(train_ds, batch_size=batch_size, verbose=0)
score = pd.DataFrame(score, index=model.metrics_names).T
score.index = ['Train Set Score']
# Validation Set Score
Temp = model.evaluate(val_ds, batch_size=batch_size, verbose=0)
Temp = pd.DataFrame(Temp, index=model.metrics_names).T
Temp.index = ['Validation Set Score']
# FIX: DataFrame.append was removed in pandas 2.0 — pd.concat is the
# supported (and long-available) equivalent.
score = pd.concat([score, Temp])
score.rename(columns=Metrics_Names, inplace=True)
score = score.reindex(sorted(score.columns), axis=1)
# FIX: Styler.set_precision was removed in pandas 2.0 — use Styler.format.
display(score.style.format(precision=4))
| Accuracy | Loss | |
|---|---|---|
| Train Set Score | 0.8389 | 0.4545 |
| Validation Set Score | 0.7424 | 0.7535 |
def Plot_history(history, PD, Title = False, metrics_names = [x.title() for x in model.metrics_names]):
    """Plot a training-history table and line chart side by side with plotly.

    Parameters
    ----------
    history : pd.DataFrame
        Output of Table_modify: an 'Iteration' column plus one column per metric.
    PD : dict
        Plot settings: 'yLim' (y-axis upper bound), 'Table_Rows' (rows to show
        in the table, or None for all), 'tablecolumnwidth', 'TableColors'.
    Title : str or False
        Figure title; False suppresses it.
    metrics_names : list of str
        Metric columns to plot.  NOTE(review): the default is evaluated once
        at definition time from the module-level `model` — confirm intended.
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=[0.6, 0.4],
                        specs=[[{"type": "scatter"},{"type": "table"}]])
    # Left panel: one line per metric vs iteration number.
    Colors = ['OrangeRed', 'MidnightBlue', 'purple']
    for j in range(len(metrics_names)):
        fig.add_trace(go.Scatter(x= history['Iteration'].values, y= history[metrics_names[j]].values,
                                 line=dict(color=Colors[j], width= 1.5), name = metrics_names[j]), 1, 1)
    fig.update_layout(legend=dict(x=0, y=1.1, traceorder='reversed', font_size=12),
                      dragmode='select', plot_bgcolor= 'white', height=600, hovermode='closest',
                      legend_orientation='h')
    fig.update_xaxes(range=[history.Iteration.min(), history.Iteration.max()],
                     showgrid=True, gridwidth=1, gridcolor='Lightgray',
                     showline=True, linewidth=1, linecolor='Lightgray', mirror=True, row=1, col=1)
    fig.update_yaxes(range=[0, PD['yLim']], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                     showline=True, linewidth=1, linecolor='Lightgray', mirror=True, row=1, col=1)
    # Right panel: table of (optionally sub-sampled) metric values.
    # FIX: identity comparison with None (PEP 8) instead of `not ... == None`.
    if PD['Table_Rows'] is not None:
        # Evenly sub-sample Table_Rows rows, always keeping the final epoch.
        ind = np.linspace(0, history.shape[0], PD['Table_Rows'], endpoint = False).round(0).astype(int)
        ind = np.append(ind, history.index[-1])
        history = history[history.index.isin(ind)]
    T = history.copy()
    # Render metric values in scientific notation for the table cells.
    T[metrics_names] = T[metrics_names].applymap(lambda x: '%.4e' % x)
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    TableColors = PD['TableColors']
    fig.add_trace(go.Table(header=dict(values = list(history.columns), line_color=TableColors[0],
                  fill_color=TableColors[0], align=['center','center'], font=dict(color=TableColors[1], size=12), height=25),
                  columnwidth = PD['tablecolumnwidth'], cells=dict(values=Temp, line_color=TableColors[0],
                  fill=dict(color=[TableColors[1], TableColors[1]]),
                  align=['center', 'center'], font_size=12,height=20)), 1, 2)
    if Title != False:
        fig.update_layout(plot_bgcolor= 'white',
                          title={'text': Title, 'x':0.46, 'y':0.94, 'xanchor': 'center', 'yanchor': 'top'},
                          yaxis_title='Frequency')
    fig.show()
# Plot settings: 25 table rows, y-axis capped at 1.2, navy/white table colours.
PD = dict(Table_Rows = 25, yLim = 1.2, tablecolumnwidth = [0.3, 0.4, 0.4], TableColors = ['Navy','White'])
Plot_history(Train_Table, Title = 'Train Set', PD = PD)
Plot_history(Validation_Table, Title = 'Validation Set', PD = PD)
Here, we trained for only a few epochs; training the model for more epochs would be needed to obtain more accurate results.